| @@ -14,6 +14,46 @@ module WebRequestConcern | ||
| 14 | 14 | end | 
| 15 | 15 | end | 
| 16 | 16 |  | 
# Faraday response middleware that normalizes a textual response body to
# UTF-8, transcoding the body string in place.
#
# Options:
# force_encoding::   Encoding (or encoding name) the body is known to be
#                    in.  When given it always wins over whatever the
#                    server or the adapter claims.
# default_encoding:: Encoding assumed for textual content whose
#                    Content-Type carries no usable charset.  When nil,
#                    such bodies are left untouched (binary).
# unzip::            Set to 'gzip' to gunzip the body before transcoding.
#
# String#encode! may still raise an Encoding error if the body contains
# byte sequences that are invalid in the chosen source encoding.
class CharacterEncoding < Faraday::Middleware
  def initialize(app, force_encoding: nil, default_encoding: nil, unzip: nil)
    super(app)
    @force_encoding = force_encoding
    @default_encoding = default_encoding
    @unzip = unzip
  end

  def call(env)
    # The block parameter is deliberately not named `env` so it does not
    # shadow the method parameter.
    @app.call(env).on_complete do |response_env|
      body = response_env[:body]

      # Nothing to decode for an absent or non-string body.
      next unless body.is_a?(String)

      body.replace(ActiveSupport::Gzip.decompress(body)) if @unzip == 'gzip'

      if @force_encoding
        encoding = @force_encoding
      elsif body.encoding == Encoding::ASCII_8BIT
        # Not all Faraday adapters support automatic charset
        # detection, so we do that.
        case response_env[:response_headers][:content_type]
        when /;\s*charset\s*=\s*([^()<>@,;:\\\"\/\[\]?={}\s]+)/i
          encoding =
            begin
              Encoding.find(Regexp.last_match(1))
            rescue ArgumentError
              # Unknown charset name: treat it like a missing one.
              @default_encoding
            end
        when /\A\s*(?:text\/[^\s;]+|application\/(?:[^\s;]+\+)?(?:xml|json))\s*(?:;|\z)/i
          encoding = @default_encoding
        else
          # Never try to transcode binary content.  `next` (not `return`)
          # so that only this callback is left; `return` would exit
          # `call` itself and drop the response.
          next
        end

        # Without a known source encoding, leave the body as binary.
        next if encoding.nil?
      else
        # The adapter already tagged the body; trust that encoding.
        encoding = body.encoding
      end

      body.encode!(Encoding::UTF_8, encoding)
    end
  end
end
| 54 | + | |
| 55 | + Faraday::Response.register_middleware character_encoding: CharacterEncoding | |
| 56 | + | |
| 17 | 57 | extend ActiveSupport::Concern | 
| 18 | 58 |  | 
| 19 | 59 | def validate_web_request_options! | 
| @@ -34,6 +74,23 @@ module WebRequestConcern | ||
| 34 | 74 | rescue ArgumentError => e | 
| 35 | 75 | errors.add(:base, e.message) | 
| 36 | 76 | end | 
| 77 | + | |
| 78 | + if (encoding = options['force_encoding']).present? | |
| 79 | + case encoding | |
| 80 | + when String | |
| 81 | + begin | |
| 82 | + Encoding.find(encoding) | |
| 83 | + rescue ArgumentError | |
| 84 | +          errors.add(:base, "Unknown encoding: #{encoding.inspect}") | |
| 85 | + end | |
| 86 | + else | |
| 87 | + errors.add(:base, "force_encoding must be a string") | |
| 88 | + end | |
| 89 | + end | |
| 90 | + end | |
| 91 | + | |
# Source encoding handed to the character_encoding middleware as its
# default_encoding option.  Including classes may override this.
def default_encoding
  ::Encoding::UTF_8
end
| 38 | 95 |  | 
| 39 | 96 | def faraday | 
| @@ -44,6 +101,11 @@ module WebRequestConcern | ||
| 44 | 101 | } | 
| 45 | 102 |  | 
| 46 | 103 |      @faraday ||= Faraday.new(faraday_options) { |builder| | 
| 104 | + builder.response :character_encoding, | |
| 105 | + force_encoding: interpolated['force_encoding'].presence, | |
| 106 | + default_encoding: default_encoding, | |
| 107 | + unzip: interpolated['unzip'].presence | |
| 108 | + | |
| 47 | 109 | builder.headers = headers if headers.length > 0 | 
| 48 | 110 |  | 
| 49 | 111 | builder.headers[:user_agent] = user_agent | 
| @@ -51,7 +113,7 @@ module WebRequestConcern | ||
| 51 | 113 | builder.use FaradayMiddleware::FollowRedirects | 
| 52 | 114 | builder.request :url_encoded | 
| 53 | 115 |  | 
| 54 | - if boolify(options['disable_url_encoding']) | |
| 116 | + if boolify(interpolated['disable_url_encoding']) | |
| 55 | 117 | builder.options.params_encoder = DoNotEncoder | 
| 56 | 118 | end | 
| 57 | 119 |  | 
| @@ -29,6 +29,7 @@ module Agents | ||
| 29 | 29 | * `basic_auth` - Specify HTTP basic auth parameters: `"username:password"`, or `["username", "password"]`. | 
| 30 | 30 | * `disable_ssl_verification` - Set to `true` to disable ssl verification. | 
| 31 | 31 | * `disable_url_encoding` - Set to `true` to disable url encoding. | 
| 32 | + * `force_encoding` - Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). | |
| 32 | 33 |            * `user_agent` - A custom User-Agent name (default: "Faraday v#{Faraday::VERSION}"). | 
| 33 | 34 | * `max_events_per_run` - Limit number of events created (items parsed) per run for feed. | 
| 34 | 35 |  | 
| @@ -87,7 +87,7 @@ module Agents | ||
| 87 | 87 |  | 
| 88 | 88 |        Set `uniqueness_look_back` to limit the number of events checked for uniqueness (typically for performance).  This defaults to the larger of #{UNIQUENESS_LOOK_BACK} or #{UNIQUENESS_FACTOR}x the number of detected received results. | 
| 89 | 89 |  | 
| 90 | - Set `force_encoding` to an encoding name if the website does not return a Content-Type header with a proper charset. | |
| 90 | + Set `force_encoding` to an encoding name if the website is known to respond with a missing, invalid or wrong charset in the Content-Type header. Note that a text content without a charset is taken as encoded in UTF-8 (not ISO-8859-1). | |
| 91 | 91 |  | 
| 92 | 92 |        Set `user_agent` to a custom User-Agent name if the website does not like the default value (`#{default_user_agent}`). | 
| 93 | 93 |  | 
| @@ -157,19 +157,6 @@ module Agents | ||
| 157 | 157 | errors.add(:base, "Invalid uniqueness_look_back format") unless is_positive_integer?(options['uniqueness_look_back']) | 
| 158 | 158 | end | 
| 159 | 159 |  | 
| 160 | - if (encoding = options['force_encoding']).present? | |
| 161 | - case encoding | |
| 162 | - when String | |
| 163 | - begin | |
| 164 | - Encoding.find(encoding) | |
| 165 | - rescue ArgumentError | |
| 166 | -            errors.add(:base, "Unknown encoding: #{encoding.inspect}") | |
| 167 | - end | |
| 168 | - else | |
| 169 | - errors.add(:base, "force_encoding must be a string") | |
| 170 | - end | |
| 171 | - end | |
| 172 | - | |
| 173 | 160 | validate_web_request_options! | 
| 174 | 161 | end | 
| 175 | 162 |  | 
| @@ -284,12 +271,6 @@ module Agents | ||
| 284 | 271 |        interpolation_context.stack { | 
| 285 | 272 | interpolation_context['_response_'] = ResponseDrop.new(response) | 
| 286 | 273 | body = response.body | 
| 287 | - if (encoding = interpolated['force_encoding']).present? | |
| 288 | - body = body.encode(Encoding::UTF_8, encoding) | |
| 289 | - end | |
| 290 | - if interpolated['unzip'] == "gzip" | |
| 291 | - body = ActiveSupport::Gzip.decompress(body) | |
| 292 | - end | |
| 293 | 274 | doc = parse(body) | 
| 294 | 275 |  | 
| 295 | 276 | if extract_full_json? |